practice: data cleaning¶
In [2]:
# import my library /تحميل المكتبات التي سوف يتم العمل بها
import pandas as pd
import numpy as np
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = None
pd.options.display.max_columns =None
In [3]:
#wrangle function / انشاء فانكشن لتنظيف البيانات
def wrangle(filename, encoding = None, dropna_columns_name=[None] ,rename_colums =None ,date_column=None,remove_duplicated_rows_columnsname=None ):
#loding Data from csv file
df=pd.read_csv(filename ,encoding = encoding)
# delet non rows
df.dropna(subset=dropna_columns_name ,inplace=True)
#Rename columns= {old:new}
df.rename(columns=rename_colums, inplace=True)
#chang orderdate and id colimns type to data integer
df[date_column] =pd.to_datetime(df[date_column])
df= df.drop_duplicates(subset=remove_duplicated_rows_columnsname)
#strip and lowercase columns names
df.columns = df.columns.str.strip().str.lower()
return df
Issues:¶
- Delete orders with NaN values
- Rename
City.1, Country.1, Phon.1Columns - Change
OrderDateType into Date - Change ID Columns, and OrderNumber column to integer
In [5]:
# run my function
orders = wrangle(filename = "all_data.csv" ,encoding="iso-8859-1" ,dropna_columns_name=["OrderID"],rename_colums={"City.1":"suppcity","Country.1":"suppCountry","Phone.1":"suppphone"},date_column="OrderDate",remove_duplicated_rows_columnsname=["OrderID"])
orders.info()
<class 'pandas.core.frame.DataFrame'> Index: 830 entries, 0 to 2130 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 830 non-null float64 1 firstname 830 non-null object 2 lastname 830 non-null object 3 city 830 non-null object 4 country 830 non-null object 5 phone 830 non-null object 6 orderid 830 non-null float64 7 orderdate 830 non-null datetime64[ns] 8 ordernumber 830 non-null float64 9 totalamount 830 non-null float64 10 productid 830 non-null float64 11 productname 830 non-null object 12 unitprice 830 non-null float64 13 package 830 non-null object 14 isdiscontinued 830 non-null object 15 supplierid 830 non-null float64 16 companyname 830 non-null object 17 contactname 830 non-null object 18 suppcity 830 non-null object 19 suppcountry 830 non-null object 20 suppphone 830 non-null object 21 fax 318 non-null object dtypes: datetime64[ns](1), float64(7), object(14) memory usage: 149.1+ KB
In [ ]:
In [6]:
# Change ID Columns, and OrderNumber column to integer
for i in orders.columns:
if "id" in i or "ordernumber" in i:
orders[i] = orders[i].astype(int)
orders=orders[["customerid","firstname","lastname","city","country","orderid","orderdate","ordernumber","totalamount"]]
orders.info()
<class 'pandas.core.frame.DataFrame'> Index: 830 entries, 0 to 2130 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 830 non-null int32 1 firstname 830 non-null object 2 lastname 830 non-null object 3 city 830 non-null object 4 country 830 non-null object 5 orderid 830 non-null int32 6 orderdate 830 non-null datetime64[ns] 7 ordernumber 830 non-null int32 8 totalamount 830 non-null float64 dtypes: datetime64[ns](1), float64(1), int32(3), object(4) memory usage: 55.1+ KB
In [8]:
# quantitative data
# describe totalamount column
orders.totalamount.describe()
Out[8]:
count 830.00 mean 1,631.88 std 1,990.61 min 12.50 25% 480.00 50% 1,015.90 75% 2,028.65 max 17,250.00 Name: totalamount, dtype: float64
In [9]:
orders.head(10)
Out[9]:
| customerid | firstname | lastname | city | country | orderid | orderdate | ordernumber | totalamount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 85 | Paul | Henriot | Reims | France | 1 | 2012-07-04 | 542378 | 440.00 |
| 3 | 79 | Karin | Josephs | Münster | Germany | 2 | 2012-07-05 | 542379 | 1,863.40 |
| 5 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 3 | 2012-07-08 | 542380 | 1,813.00 |
| 8 | 84 | Mary | Saveley | Lyon | France | 4 | 2012-07-08 | 542381 | 670.80 |
| 11 | 76 | Pascale | Cartrain | Charleroi | Belgium | 5 | 2012-07-09 | 542382 | 3,730.00 |
| 14 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 6 | 2012-07-10 | 542383 | 1,444.80 |
| 17 | 14 | Yang | Wang | Bern | Switzerland | 7 | 2012-07-11 | 542384 | 625.20 |
| 20 | 68 | Michael | Holz | Genève | Switzerland | 8 | 2012-07-12 | 542385 | 2,490.50 |
| 24 | 88 | Paula | Parente | Resende | Brazil | 9 | 2012-07-15 | 542386 | 517.80 |
| 26 | 35 | Carlos | Hern?ndez | San Crist?bal | Venezuela | 10 | 2012-07-16 | 542387 | 1,119.90 |
In [11]:
# We need to know why mean > median, using histogram to know the shape of data
sns.histplot(data = orders , x='totalamount',kde=True)
#---------------------------------
plt.title("distribution of totalamount of invoices")
plt.xlabel("totalamount")
plt.ylabel("frequercy")
#---------------------------------
plt.axvline(orders.totalamount.mean() ,color='red' )
plt.axvline(orders.totalamount.median() , color="black")
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Out[11]:
<matplotlib.lines.Line2D at 0x19496a38d50>
In [12]:
min = orders.totalamount.min()
q1= orders.totalamount.quantile(.25)
q2= orders.totalamount.median()
q3= orders.totalamount.quantile(.75)
iqr= q3-q1
lb=q1-(1.5*iqr)
ub=q3+(1.5*iqr)
In [13]:
print("min=" , min , "q1=" , q1 ,"q3=" ,"q2=",q2, q3 , "iqr= ",iqr ,"lb=",lb , "ub =",ub)
min= 12.5 q1= 480.0 q3= q2= 1015.9 2028.65 iqr= 1548.65 lb= -1842.9750000000004 ub = 4351.625
In [14]:
orders.select_dtypes('object').nunique()
Out[14]:
firstname 84 lastname 88 city 69 country 21 dtype: int64
In [31]:
# Boxplot to know the 5 number summary
px.box(data_frame=orders , x='totalamount')
In [33]:
orders=orders[orders.totalamount < 15000]
In [35]:
px.box(data_frame=orders , x='totalamount')
In [38]:
px.bar(data_frame=orders.city.value_counts() , title="Distribution of cityes")
In [40]:
px.bar(data_frame=orders.country.value_counts() , title="Distribution of countrys")
In [42]:
px.pie(data_frame=orders, names="country")
through the preliminary analysis , we see that london is the city that has largest share of lagest share of purchasing our products with 46 invoices ,and that the united states and germany are the most purchasing countries for our products
نري من خلال التحليل الاول للبيانات ان لاندن هيا التي تملك الحصه الاكبر من الشراء بواقع 46 فاتوره ,وان الويلايت المتحده و المانيا هم اكثر الدول شراء لمنتجات بواقع 121 فتوره بنسبه 14.6% من اجمالي مبيعاتنا
In [44]:
orders.head(5)
Out[44]:
| customerid | firstname | lastname | city | country | orderid | orderdate | ordernumber | totalamount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 85 | Paul | Henriot | Reims | France | 1 | 2012-07-04 | 542378 | 440.00 |
| 3 | 79 | Karin | Josephs | Münster | Germany | 2 | 2012-07-05 | 542379 | 1,863.40 |
| 5 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 3 | 2012-07-08 | 542380 | 1,813.00 |
| 8 | 84 | Mary | Saveley | Lyon | France | 4 | 2012-07-08 | 542381 | 670.80 |
| 11 | 76 | Pascale | Cartrain | Charleroi | Belgium | 5 | 2012-07-09 | 542382 | 3,730.00 |
In [46]:
orders.groupby("country",as_index=False)['totalamount'].sum().sort_values("totalamount",ascending=False)
Out[46]:
| country | totalamount | |
|---|---|---|
| 19 | USA | 247,245.08 |
| 8 | Germany | 227,390.63 |
| 1 | Austria | 139,496.63 |
| 3 | Brazil | 99,158.48 |
| 7 | France | 85,498.76 |
| 20 | Venezuela | 60,814.89 |
| 18 | UK | 60,616.51 |
| 16 | Sweden | 59,523.70 |
| 9 | Ireland | 57,317.39 |
| 4 | Canada | 55,334.10 |
| 2 | Belgium | 35,134.98 |
| 5 | Denmark | 34,782.25 |
| 17 | Switzerland | 32,919.50 |
| 11 | Mexico | 24,073.45 |
| 6 | Finland | 19,778.45 |
| 15 | Spain | 19,431.89 |
| 10 | Italy | 16,705.15 |
| 14 | Portugal | 12,468.65 |
| 0 | Argentina | 8,119.10 |
| 12 | Norway | 5,735.15 |
| 13 | Poland | 3,531.95 |
In [48]:
# plots funtion
def plots(coulumname, top10=False):
prop=orders.groupby(coulumname ,as_index=False)["totalamount"].sum()
prop["proporiton"]=(prop.totalamount / orders.totalamount.sum()) * 100
prop=prop.sort_values("proporiton" , ascending=False)
print("describe of totalamount ", prop.totalamount.describe())
if top10 == True :
prop = prop.head(10)
fig1= px.bar(data_frame=prop , x=coulumname ,y="totalamount")
fig2= px.pie(data_frame=prop , names=coulumname , values="totalamount")
fig2.show()
fig1.show()
country vs totalamount¶
In [50]:
plots("country")
describe of totalamount count 21.00 mean 62,146.51 std 67,460.19 min 3,531.95 25% 19,431.89 50% 35,134.98 75% 60,814.89 max 247,245.08 Name: totalamount, dtype: float64
نلاحظ ان امريكا هيا الاعلي شراء بنسبه 18.1% و بمبلغ 247,254 الف
city vs totalamount top 10¶
In [52]:
plots("city", True)
describe of totalamount count 69.00 mean 18,914.15 std 22,692.99 min 357.00 25% 4,788.06 50% 11,830.10 75% 23,850.95 max 113,236.68 Name: totalamount, dtype: float64
نلاحظ ان:
اعلي مدينه اشترت بضاعه هيا جراز بنسبه 18.1% بمجموع مبيعات 113.236 بينما لاندن تظهر 5 في المشتريات بنسبه 8.71 بمبلغ مشتريات 54.4702 الف بينما كنت هيا الاوله في عدد الفوتير
In [54]:
orders.head()
Out[54]:
| customerid | firstname | lastname | city | country | orderid | orderdate | ordernumber | totalamount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 85 | Paul | Henriot | Reims | France | 1 | 2012-07-04 | 542378 | 440.00 |
| 3 | 79 | Karin | Josephs | Münster | Germany | 2 | 2012-07-05 | 542379 | 1,863.40 |
| 5 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 3 | 2012-07-08 | 542380 | 1,813.00 |
| 8 | 84 | Mary | Saveley | Lyon | France | 4 | 2012-07-08 | 542381 | 670.80 |
| 11 | 76 | Pascale | Cartrain | Charleroi | Belgium | 5 | 2012-07-09 | 542382 | 3,730.00 |
In [58]:
px.line(data_frame = date_total , y="totalamount" , x="orderdate")
C:\Users\dell\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning: The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result
customers vs totalamount¶
In [61]:
orders.head(5)
Out[61]:
| customerid | firstname | lastname | city | country | orderid | orderdate | ordernumber | totalamount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 85 | Paul | Henriot | Reims | France | 1 | 2012-07-04 | 542378 | 440.00 |
| 3 | 79 | Karin | Josephs | Münster | Germany | 2 | 2012-07-05 | 542379 | 1,863.40 |
| 5 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 3 | 2012-07-08 | 542380 | 1,813.00 |
| 8 | 84 | Mary | Saveley | Lyon | France | 4 | 2012-07-08 | 542381 | 670.80 |
| 11 | 76 | Pascale | Cartrain | Charleroi | Belgium | 5 | 2012-07-09 | 542382 | 3,730.00 |
In [63]:
cust_orders={
"full_name": orders.firstname + " " + orders.lastname,
"totalamount": orders.totalamount
}
cust_orders=pd.DataFrame(cust_orders)
cust_orders.head(10)
Out[63]:
| full_name | totalamount | |
|---|---|---|
| 0 | Paul Henriot | 440.00 |
| 3 | Karin Josephs | 1,863.40 |
| 5 | Mario Pontes | 1,813.00 |
| 8 | Mary Saveley | 670.80 |
| 11 | Pascale Cartrain | 3,730.00 |
| 14 | Mario Pontes | 1,444.80 |
| 17 | Yang Wang | 625.20 |
| 20 | Michael Holz | 2,490.50 |
| 24 | Paula Parente | 517.80 |
| 26 | Carlos Hern?ndez | 1,119.90 |
In [65]:
#top 10 cust count orders
fig=px.bar(cust_orders.groupby("full_name")["totalamount"].count().sort_values(ascending=False).head(10),text_auto=True ,
labels={"full_name":"full_name","value":"frequency" })
In [67]:
fig.update_traces(marker_color= "rgb(158,202,225)" ,marker_line_color="rgb(8,84,107)",opacity=0.6,textposition="outside")
In [69]:
#top sum cust orders
fig=px.bar(cust_orders.groupby("full_name")["totalamount"].sum().sort_values(ascending=False).head(10),text_auto=True ,
labels={"full_name":"full_name","value":"frequency" })
In [71]:
fig.update_traces(marker_color= "rgb(158,250,255)" ,marker_line_color="rgb(8,90,107)",opacity=0.6,textposition="inside")
multivariate analysis¶
In [74]:
orders.head()
Out[74]:
| customerid | firstname | lastname | city | country | orderid | orderdate | ordernumber | totalamount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 85 | Paul | Henriot | Reims | France | 1 | 2012-07-04 | 542378 | 440.00 |
| 3 | 79 | Karin | Josephs | Münster | Germany | 2 | 2012-07-05 | 542379 | 1,863.40 |
| 5 | 34 | Mario | Pontes | Rio de Janeiro | Brazil | 3 | 2012-07-08 | 542380 | 1,813.00 |
| 8 | 84 | Mary | Saveley | Lyon | France | 4 | 2012-07-08 | 542381 | 670.80 |
| 11 | 76 | Pascale | Cartrain | Charleroi | Belgium | 5 | 2012-07-09 | 542382 | 3,730.00 |
country with orderdate and totalamount
we interested in nowing the number of countries and the totala amount of them month¶
In [78]:
multivar = orders[["orderdate" , "country" , "totalamount"]]
multivar = multivar.set_index("orderdate")
multivar.head()
Out[78]:
| country | totalamount | |
|---|---|---|
| orderdate | ||
| 2012-07-04 | France | 440.00 |
| 2012-07-05 | Germany | 1,863.40 |
| 2012-07-08 | Brazil | 1,813.00 |
| 2012-07-08 | France | 670.80 |
| 2012-07-09 | Belgium | 3,730.00 |
In [80]:
multivar = multivar.resample("1M").agg({'country': "nunique" ,"totalamount":"sum"}) #aggrygat funciton()
multivar.head()
Out[80]:
| country | totalamount | |
|---|---|---|
| orderdate | ||
| 2012-07-31 | 11 | 30,192.10 |
| 2012-08-31 | 11 | 26,609.40 |
| 2012-09-30 | 11 | 27,636.00 |
| 2012-10-31 | 13 | 41,203.60 |
| 2012-11-30 | 11 | 49,704.00 |
In [82]:
fig = px.bar (data_frame=multivar,x = multivar.index , y=round(multivar.totalamount,0), color =multivar.country ,text_auto=True , title="grand total amount of all countries per month" , labels={"orderdate":"order date" ,"y" :"totalamount"})
C:\Users\dell\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning: The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result
In [84]:
fig
In [ ]: